This is the next step after creating the required dataset for the project: exploring the data, finding anomalies, outliers, and missing (NA) values, and preparing the data for modeling.
# Restore the objects stored in validation_test.rda. load() invisibly returns
# the names of the objects it restored, so check that `dataset` is really one
# of them instead of silently continuing with a stale or missing object.
loaded_objects <- load("validation_test.rda")
stopifnot("dataset" %in% loaded_objects)
# Preview the first 10 rows.
head(dataset, n = 10)
## amb_temp lat lon population LST Day_population imprev
## 2 -3.3 41.85814 -87.61606 2515 3.9 3009 0.5169038
## 3 -7.2 41.81034 -87.59023 1654 1.1 5496 0.9253708
## 6 -1.1 41.85218 -87.67583 1436 2.2 2275 0.3241619
## 7 0.0 41.73631 -87.62418 3971 2.8 4864 0.2578571
## 8 0.0 41.76832 -87.68340 2878 5.0 4866 0.2573717
## 9 0.0 41.85780 -87.68581 3920 3.3 6008 0.4224022
## 10 -7.2 41.72246 -87.57535 3985 1.7 2490 0.8199707
## 11 5.6 41.73649 -87.61453 3400 12.2 1007 0.5233647
## 12 -3.3 41.96509 -87.67908 2898 4.4 1505 0.7719977
## 14 -6.7 41.83258 -87.64613 2251 0.6 1105 0.4883683
## Land_Cover
## 2 10
## 3 3
## 6 6
## 7 7
## 8 3
## 9 9
## 10 4
## 11 10
## 12 2
## 14 2
# Per-column summary: min / quartiles / mean / max for the numeric columns and
# per-level counts for the factor column (Land_Cover).
summary(dataset)
## amb_temp lat lon population
## Min. :-11.100 Min. :41.69 Min. :-87.76 Min. : 786
## 1st Qu.: -8.175 1st Qu.:41.79 1st Qu.:-87.68 1st Qu.:1700
## Median : -5.850 Median :41.88 Median :-87.66 Median :2544
## Mean : -4.679 Mean :41.85 Mean :-87.66 Mean :2956
## 3rd Qu.: -1.108 3rd Qu.:41.91 3rd Qu.:-87.63 3rd Qu.:3982
## Max. : 5.600 Max. :41.97 Max. :-87.54 Max. :7868
##
## LST Day_population imprev Land_Cover
## Min. :-5.600 Min. : 676 Min. :0.1071 10 :12
## 1st Qu.:-0.600 1st Qu.:1887 1st Qu.:0.3879 9 :10
## Median : 1.400 Median :2786 Median :0.5412 4 : 8
## Mean : 1.158 Mean :3103 Mean :0.5421 11 : 7
## 3rd Qu.: 3.300 3rd Qu.:4144 3rd Qu.:0.7291 6 : 7
## Max. :12.200 Max. :7796 Max. :0.9254 7 : 7
## (Other):23
Let’s look at boxplots to compare the distributions of the variables:
library(ggplot2)
library(plotly)
library(tidyverse)
# Tidy the data for visualization: reshape columns 1-7 (everything except
# Land_Cover) to long format, keeping lat/lon as identifiers, so that each row
# holds one parameter name and its value.
# pivot_longer() is the maintained replacement for the superseded gather();
# the result is a tibble with columns lat, lon, parameter, value.
data.tidy <- dataset[, 1:7] %>%
  pivot_longer(
    cols = -c(lat, lon),
    names_to = "parameter",
    values_to = "value"
  )

# One boxplot panel per parameter. scales = "free" releases both axes: the
# y-axis because the parameters live on very different ranges, and the x-axis
# because with a shared discrete x scale every panel would reserve slots for
# all parameters while drawing only one box.
bxp.tmeps <- ggplot(data.tidy, aes(x = parameter, y = value)) +
  geom_boxplot() +
  facet_wrap(~parameter, scales = "free")

# Convert the static ggplot to an interactive plotly widget and display it.
bxp.tmeps <- ggplotly(bxp.tmeps)
bxp.tmeps
Next, let’s look at the histograms of the data:
# Histogram of `value` for every parameter, one panel per parameter, each
# panel with its own x-axis range.
hist.temps <- ggplot(data = data.tidy, mapping = aes(x = value)) +
  geom_histogram() +
  facet_wrap(vars(parameter), scales = "free_x")
# Build the interactive plotly version of the histogram grid.
h <- ggplotly(p = hist.temps)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
h
Now let’s take a look at the correlations between the variables:
library(corrplot)
# Drop the coordinate columns by name rather than by position, so the
# selection keeps working if the column order of `dataset` ever changes.
data_to_corr <- dataset[, setdiff(names(dataset), c("lat", "lon")), drop = FALSE]

# Land_Cover is a factor. as.numeric() applied directly to a factor returns
# the internal level codes (1, 2, 3, ...), not the land-cover labels shown in
# the data, so convert via character to recover the actual label values.
# NOTE(review): Land_Cover looks like a categorical class code; its linear
# correlation with the other variables should be read with caution.
data_to_corr$Land_Cover <- as.numeric(as.character(data_to_corr$Land_Cover))

# Pairwise correlation matrix of all remaining columns, drawn as circles.
correlations <- cor(data_to_corr)
corrplot(correlations, method = "circle")
Next, we should consider feature selection and possibly remove the most highly correlated attributes.